/* Copyright (C) 2000-2002 Lavtech.com corp. All rights reserved.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <sys/types.h>
#include <dirent.h>

#include "udm_config.h"

#include "udm_common.h"
#include "udm_crc32.h"
#include "udm_guesser.h"

#include "udm_utils.h"
#include "udm_unicode.h"
#include "udm_log.h"
#include "udm_vars.h"

/*#define DEBUG_GUESSER*/

/*
 * Return the language map registered in Env for the given language/charset
 * pair, appending a new zero-filled entry to Env->LangMaps when none exists.
 *
 * Env     - environment whose LangMaps list is searched and possibly grown.
 * lang    - language name; matched against existing entries with strcmp().
 * charset - charset name; matched against existing entries with strcmp().
 *
 * Returns a pointer into Env->LangMaps.Map, or NULL when memory for a new
 * entry cannot be obtained (the list is left untouched in that case).
 * Appending an entry may move the whole array, so a pointer obtained from an
 * earlier call must not be used after a call that adds an entry.
 */
static UDM_LANGMAP *FindLangMap(UDM_ENV *Env, char *lang, char *charset) {
  UDM_LANGMAP *grown;
  UDM_LANGMAP *o;
  char *charset_copy;
  char *lang_copy;
  int i;

  for(i = 0; i < Env->LangMaps.nmaps; i++) {
    if(
       (!strcmp(Env->LangMaps.Map[i].charset, charset))
       &&
       (!strcmp(Env->LangMaps.Map[i].lang, lang))
       ) {
      return &Env->LangMaps.Map[i];
    }
  }

  /* Duplicate the names before touching the list so that a failure here
     cannot leave a half-initialised entry behind. */
  charset_copy = strdup(charset);
  lang_copy = strdup(lang);
  if((charset_copy == NULL) || (lang_copy == NULL)) {
    free(charset_copy);
    free(lang_copy);
    return NULL;
  }

  if(Env->LangMaps.nmaps == 0){
    grown = (UDM_LANGMAP*)malloc(sizeof(UDM_LANGMAP));
  }else{
    /* Keep the old pointer until realloc() is known to have succeeded;
       on failure the original array is still valid and still owned by Env. */
    grown = (UDM_LANGMAP*)realloc(Env->LangMaps.Map,(Env->LangMaps.nmaps+1)*sizeof(UDM_LANGMAP));
  }
  if(grown == NULL) {
    free(charset_copy);
    free(lang_copy);
    return NULL;
  }
  Env->LangMaps.Map = grown;

  o = &grown[Env->LangMaps.nmaps];
  memset(o, 0, sizeof(UDM_LANGMAP));
  o->charset = charset_copy;
  o->lang = lang_copy;
  Env->LangMaps.nmaps++;
  return o;
}

/*
 * Load n-gram statistics from a language map file into Env->LangMaps.
 *
 * The file is read line by line (lines are read into a 1000-byte buffer):
 *   - lines starting with '#', a space or a tab are skipped;
 *   - "Charset:<name>" and "Language:<name>" select the map that the
 *     following data lines are accumulated into;
 *   - any other line is "<ngram>\t<count>", where '_' inside the n-gram
 *     stands for a space. Lines without a tab, with a count that parses
 *     to 0, or with an n-gram that is empty or longer than UDM_LM_MAXGRAM
 *     are ignored. Data lines seen while either the charset or the language
 *     is not set are ignored as well.
 *
 * After reading, the last map selected by a data line is checked (language
 * present, charset present and known to UdmGetCharSet()) and passed to
 * UdmPrepareLangMap(). Only that last map is validated and prepared.
 *
 * Returns 0 on success, including the case where both headers were present
 * but no data line selected a map. Returns -1 after setting Env->errcode
 * to 1 and writing a message into Env->errstr when the file cannot be
 * opened, a map cannot be allocated, no language or charset was defined,
 * or the charset is unknown.
 *
 * NOTE(review): Env->errstr is filled with sprintf(); its capacity is not
 * visible here, so a very long filename may overflow it - confirm.
 */
int UdmLoadLangMapFile(UDM_ENV * Env, const char * filename){
	FILE * f;
	char str[1000];
	char *Ccharset = NULL, *Clanguage = NULL;
	UDM_LANGMAP *Cmap = NULL;
	int need_lookup = 1;	/* Cmap must be (re)resolved before the next data line */
	int rc = 0;

	f=fopen(filename,"r");
	if(!f){
		Env->errcode=1;
		sprintf(Env->errstr,"Can't open LangMapFile '%s'\n",filename);
		return -1;
	}
	while(fgets(str,sizeof(str),f)){
		if(str[0]=='#'||str[0]==' '||str[0]=='\t')continue;

		if(!strncmp(str,"Charset:",8)){
			char * charset, * lasttok;
			UDM_FREE(Ccharset);
			if((charset=strtok_r(str+8," \t\n\r",&lasttok))){
				Ccharset = strdup(charset);
			}
			need_lookup = 1;
		}else
		if(!strncmp(str,"Language:",9)){
			char * lang, *lasttok;
			UDM_FREE(Clanguage);
			if((lang=strtok_r(str+9," \t\n\r",&lasttok))){
				Clanguage = strdup(lang);
			}
			need_lookup = 1;
		}else{
			char *s;
			int count;
			size_t len;

			if(!(s=strchr(str,'\t')))continue;

			/* Resolve the target map once per header change instead of
			   once per line, and never touch a map before both headers
			   are known. */
			if(need_lookup){
				if((Ccharset == NULL) || (Clanguage == NULL))continue;
				Cmap = FindLangMap(Env, Clanguage, Ccharset);
				if(Cmap == NULL){
					Env->errcode=1;
					sprintf(Env->errstr,"Can't allocate language map for LangMapFile '%s'\n",filename);
					rc = -1;
					break;
				}
				need_lookup = 0;
			}
			*s='\0';

			count=atoi(s+1);
			len=strlen(str);
			if((count==0)||(len<1)||(len>UDM_LM_MAXGRAM))
				continue;

			for(s=str;*s;s++){
				if(*s=='_')*s=' ';
			}
			{
				int hindex;
				hindex = ((unsigned int)UdmCRC32(str,len)) & UDM_LM_HASHMASK;
				Cmap->memb[hindex].count += count;
			}
		}
	}
	fclose(f);

	if(rc == 0){
		if(Cmap == NULL){
			/* No data line ever selected a map. */
			if(Clanguage == NULL){
				Env->errcode=1;
				sprintf(Env->errstr,"No language definition in LangMapFile '%s'\n",filename);
				rc = -1;
			}else if(Ccharset == NULL){
				Env->errcode=1;
				sprintf(Env->errstr,"No charset definition in LangMapFile '%s'\n",filename);
				rc = -1;
			}
			/* Both headers but no n-gram lines: nothing was loaded. */
		}else if(!Cmap->lang){
			Env->errcode=1;
			sprintf(Env->errstr,"No language definition in LangMapFile '%s'\n",filename);
			rc = -1;
		}else if(!Cmap->charset){
			Env->errcode=1;
			sprintf(Env->errstr,"No charset definition in LangMapFile '%s'\n",filename);
			rc = -1;
		}else if(!UdmGetCharSet(Cmap->charset)){
			Env->errcode=1;
			sprintf(Env->errstr,"Unknown charset '%s' in LangMapFile '%s'\n",
				Cmap->charset,filename);
			rc = -1;
		}else{
			/* Prepare the map this file actually filled, which is not
			   necessarily the last entry of the list. */
			UdmPrepareLangMap(Cmap);
		}
	}

	UDM_FREE(Clanguage);
	UDM_FREE(Ccharset);
	return rc;
}


/*
 * Release everything owned by a language map list: the charset and lang
 * strings of each entry, then the entry array itself. The entry count is
 * reset to zero afterwards.
 */
void UdmLangMapListFree(UDM_LANGMAPLIST *List){
	size_t idx = 0;

	while(idx < List->nmaps){
		UDM_LANGMAP *entry = &List->Map[idx];

		UDM_FREE(entry->charset);
		UDM_FREE(entry->lang);
		idx++;
	}
	UDM_FREE(List->Map);
	List->nmaps = 0;
}


/*
 * Accumulate n-gram statistics of a text buffer into a language map.
 *
 * map     - map whose memb[] counters are incremented.
 * text    - start of the buffer; it does not need to be NUL-terminated.
 * textlen - number of bytes in the buffer.
 * StrFlag - when non-zero, the text of each n-gram is also copied into the
 *           str field of the slot it hashes to.
 *
 * Every byte that is neither below ' ' nor a space following another space
 * starts a series of n-grams of length 1..UDM_LM_MAXGRAM. While an n-gram is
 * extended, bytes below ' ' are skipped and runs of spaces are collapsed to
 * one. Each n-gram is hashed with UdmCRC32(), reduced with UDM_LM_HASHMASK,
 * and the counter of the resulting slot is incremented.
 *
 * Only bytes in [text, text+textlen) are read. For NUL-terminated input the
 * result is the same as when the terminator is examined, because bytes
 * below ' ' are skipped anyway.
 */
void UdmBuildLangMap(UDM_LANGMAP * map,const char * text,size_t textlen, int StrFlag){
	const char * end=text+textlen;	/* one past the last byte of the buffer */
	int prevb=' ';			/* makes leading spaces count as repeated */

	for(;text<end;text++){
		char buf[UDM_LM_MAXGRAM+3];
		size_t buflen=0;
		const char * t;
		int code;
		int prev=0;

		code=(unsigned char)(*text);
		if(code<' ')continue;
		if((code==' ')&&(prevb==' '))continue;
		prevb=code;

		t=text;
		for(buflen=0;buflen<UDM_LM_MAXGRAM;buflen++){
			int hindex;

			/* Advance to the next byte that belongs in the n-gram. */
			for(;t<end;t++){
				code=(unsigned char)(*t);
				if(code<' ')continue;
				if((code==' ')&&(prev==' '))continue;
				prev=code;
				break;
			}
			if(t>=end)break;	/* buffer exhausted, no longer n-gram possible */
			t++;

			buf[buflen]=code;
			buf[buflen+1]='\0';

			hindex = UdmCRC32(buf,buflen+1);
			hindex=((unsigned int)hindex ) & UDM_LM_HASHMASK;
			map->memb[hindex].count++;

#ifdef DEBUG_GUESSER
			/* Print collision */
			if(map->memb[hindex].str[0]){
				int res;
				res=strcmp(map->memb[hindex].str,buf);
				if(res){
					printf("Coll %04X '%s' '%s'\n",hindex,map->memb[hindex].str,buf);
					strcpy(map->memb[hindex].str,buf);
				}
			}
#endif
			if (StrFlag) strcpy(map->memb[hindex].str, buf);
		}
	}
}


/*
 * Turn the raw counters of a language map into per-slot probabilities.
 *
 * The sum of all memb[].count values is stored in map->expectation. Each
 * slot's p then becomes count divided by that sum, or the constant 0.0000001
 * for slots whose count is zero.
 */
void UdmPrepareLangMap(UDM_LANGMAP * map){
	const int slots = UDM_LM_HASHMASK + 1;
	float total = 0;
	int n = 0;

	/* First pass: total number of hits over all slots. */
	while(n < slots){
		total += map->memb[n].count;
		n++;
	}
	map->expectation = total;

	/* Second pass: relative frequency of every slot. */
	for(n = 0; n < slots; n++){
		if(map->memb[n].count){
			map->memb[n].p = (map->memb[n].count) / (total);
		}else{
			map->memb[n].p = 0.0000001;
		}
	}
}


/*
 * Compare two language maps slot by slot.
 *
 * For every hash slot the natural logarithm of the ratio of the two
 * probabilities (map0 p divided by map1 p) is squared, and the squares are
 * summed. The result is 0 when all ratios are 1 and grows as the maps
 * diverge.
 *
 * The p fields are read as they are; a zero p in map1 is not guarded
 * against. UdmPrepareLangMap() fills p for every slot.
 */
float UdmCheckLangMap(UDM_LANGMAP * map0,UDM_LANGMAP * map1){
	const UDM_LANGITEM * mp0 = map0->memb;
	const UDM_LANGITEM * mp1 = map1->memb;
	const int nmemb = UDM_LM_HASHMASK + 1;
	float res=0.0;
	int i;

	for(i=0;i<nmemb;i++){
		float t = log(mp0[i].p / mp1[i].p);
		res += t * t;
	}
	return res;
}


/* Structure to sort guesser results: pairs a language map from the list
   with the score obtained when it is compared against a document's map. */
typedef struct {
	UDM_LANGMAP * map;	/* map the score belongs to */
	float quality;		/* value returned by UdmCheckLangMap() for this map */
} UDM_MAPSTAT;

static int statcmp(const void * i1, const void * i2){
	float fres;
	fres = ((const UDM_MAPSTAT*)(i2))->quality - ((const UDM_MAPSTAT*)(i1))->quality;
	if(fres<0)return +1;
	if(fres>0)return -1;
	return 0;
}

/*
 * Guess the language and charset of a document from its n-gram map.
 *
 * Doc     - document whose Sections list receives the guessed values.
 * List    - reference language maps to compare against.
 * LangMap - map built from the document text; it is passed through
 *           UdmPrepareLangMap() here before being compared.
 *
 * Every map in List is scored with UdmCheckLangMap() and the scores are
 * sorted in ascending order. Walking at most the first ten candidates, the
 * "Content-Language" and "Charset" sections are each added from the first
 * candidate that provides a value, but only while the document has no such
 * section yet; values already present are left alone.
 *
 * Returns 0 on success (including an empty List, in which case nothing is
 * added), or -1 when the score array cannot be allocated.
 */
int  UdmGuessCharSet(UDM_DOCUMENT * Doc,UDM_LANGMAPLIST *List,UDM_LANGMAP *LangMap){
	const size_t max_candidates = 10;	/* how many of the best maps are consulted */
	size_t nmaps;
	size_t i;
	UDM_MAPSTAT * mapstat;
	
	/* 
		TODO for Guesser
		
		There are three sources to detect charset:
		1. HTTP header:  Content-Type: ... charset=XXX
		2. <META HTTP-EQUIV="Content-Type" Content="... charset=YYY">
		3. ZZZ[i] - array of guessed charsets in mapstat[]
		good(ZZZ[n]) means that guesser returned good results for n.

		Now we may have various combinations:
		Simple situations, non-guessed and guessed charsets
		seem to be the same. At least one of non-guessed
		charset is the same with the best guessed charset
		and guessed charset gave good results:

		1. XXX==YYY==ZZZ[0],      good(ZZZ[0]). Take XXX value.
		2. No YYY, XXX=ZZZ[0] and good(ZZZ[0]). Take XXX value.
		3. No XXX, YYY=ZZZ[0] and good(ZZZ[0]). Take YYY value.
		4. XXX<>YYY, XXX==ZZZ[0], good(ZZZ[0]). Take XXX value.
		5. XXX<>YYY, YYY==ZZZ[0], good(ZZZ[0]). Take XXX value.
			4 and 5 seem to be webmaster mistake.

		There are the same five situations when ZZZ[x] is still good 
		enough, but it is not the best one, i.e. x>0 
		Choose charset in the same way.
	*/

	/* Prepare document langmap */
	UdmPrepareLangMap(LangMap);

	/* Nothing to compare against: avoid a zero-sized allocation and sort. */
	nmaps = List->nmaps;
	if(nmaps == 0)return 0;

	/* Allocate memory for comparison statistics */
	mapstat=(UDM_MAPSTAT *)malloc(nmaps*sizeof(UDM_MAPSTAT));
	if(mapstat == NULL)return -1;

	for(i=0;i<nmaps;i++){
		mapstat[i].quality=UdmCheckLangMap(&List->Map[i],LangMap);
		mapstat[i].map=&List->Map[i];
	}

	/* Sort statistics in quality order */
	qsort(mapstat,nmaps,sizeof(UDM_MAPSTAT),&statcmp);

	/* Take the missing sections from the best candidates, best one first */
	for(i=0;(i<nmaps)&&(i<max_candidates);i++){
		if(!UdmVarListFindStr(&Doc->Sections,"Content-Language",NULL)){
			if(mapstat[i].map->lang){
				UdmVarListAddStr(&Doc->Sections,"Content-Language",mapstat[i].map->lang);
			}
		}
		if(!UdmVarListFindStr(&Doc->Sections,"Charset",NULL)){
			if (mapstat[i].map->charset) {
				UdmVarListAddStr(&Doc->Sections,"Charset",mapstat[i].map->charset);
			}
		}
	}

	free(mapstat);
	return 0;
}
